# import library
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
# generating data except for gene1,2
# parameters
num_rows = 5000 # number of samples
num_genes = 19998
min_value = 0
max_value = 1.0
n_class = 3
heterogeneity_list = [2, 3, 4, 5] # number of isolatablesubpopulation in population
data_overlap= 0.2 # overlap of spectrum foming gene in each subpopulation
generated_data = np.zeros((num_rows,num_genes, len(heterogeneity_list)))
for k, n_subpopulation in enumerate(heterogeneity_list):
print(f'number of subpopulation: {n_subpopulation}')
base_dist_range = 1./(n_class-(n_class-1)*data_overlap)
base_dist_range2 = 1./(2 - data_overlap)
n_pt_each_sp = num_rows//n_subpopulation
sp_id_for_pt = np.zeros(num_rows)
for k2 in range(n_subpopulation):
sp_id_for_pt[k2*(n_pt_each_sp):((k2+1)*(n_pt_each_sp))] = k2
for pt_number in range(num_rows):
pt_class = pt_number % n_class
min_value_gene_type1 = base_dist_range * pt_class * (1 - data_overlap)
max_value_gene_type1 = base_dist_range * (pt_class * (1 - data_overlap) + 1.)
gene_type1_value = np.random.uniform(min_value_gene_type1, max_value_gene_type1)
pt_class2 = pt_class % 2
min_value_gene_type2 = base_dist_range2 * pt_class2 * (1 - data_overlap)
max_value_gene_type2 = base_dist_range2 * (pt_class2 * (1 - data_overlap) + 1.)
gene_type2_value = np.random.uniform(min_value_gene_type2, max_value_gene_type2)
generated_data[pt_number,:, k ] = np.random.uniform(0,1, size = (num_genes,))
gene_id_type1 = int(0 + sp_id_for_pt[pt_number] *2)
gene_id_type2 = int(1 + sp_id_for_pt[pt_number] *2)
generated_data[pt_number,gene_id_type1, k ] = gene_type1_value
generated_data[pt_number,gene_id_type2, k ] = gene_type2_value
number of subpopulation: 2 number of subpopulation: 3 number of subpopulation: 4 number of subpopulation: 5
# Class label for every sample (cycles 0,1,2,...); reused by the plots and models below.
pt_class = np.arange(num_rows)%n_class
sns.set()
sns.set_style('white')
sns.set_palette('bright')
# Grid: rows = informative gene pairs, columns = heterogeneity settings.
fig, ax = plt.subplots(len(heterogeneity_list)+2,len(heterogeneity_list), figsize=(13, 13))
fig.subplots_adjust(hspace=0.8, wspace=0.5)
for k in range(len(heterogeneity_list)):
    for k2 in range(max(heterogeneity_list)+1):
        ax_tmp=ax[k2,k]
        # One scatter per class: rows pt_number % 3 == c all share class c.
        ax_tmp.scatter(generated_data[0::3,0+2*k2,k],generated_data[0::3,1+2*k2,k], s = 1.,alpha =0.5)
        ax_tmp.scatter(generated_data[1::3,0+2*k2,k],generated_data[1::3,1+2*k2,k], s = 1.,alpha =0.5)
        ax_tmp.scatter(generated_data[2::3,0+2*k2,k],generated_data[2::3,1+2*k2,k], s = 1.,alpha =0.5)
        if k2 == 0:
            ax_tmp.set_title(f"No. of subpopulation: {heterogeneity_list[k]}")
        ax_tmp.set_xlabel(f'gene {1+2*k2}')
        ax_tmp.set_ylabel(f'gene {2+2*k2}')
        if k == 0:
            pass  # keep y tick labels only on the leftmost column
        else:
            ax_tmp.tick_params(labelleft=False)
sns.set()
sns.set_style('white')
sns.set_palette('bright')
# NOTE(review): this initial sp_id_for_pt uses n_subpopulation / n_pt_each_sp left
# over from the previous cell's loop; it is recomputed inside the loop below before use.
sp_id_for_pt = np.zeros(num_rows)
for k2 in range(n_subpopulation):
    sp_id_for_pt[k2*(n_pt_each_sp):((k2+1)*(n_pt_each_sp))] = k2
# One figure per heterogeneity setting: column 0 shows all samples,
# columns 1..n_subpopulation show each subpopulation separately.
for k, n_subpopulation in enumerate(heterogeneity_list):
    plt.figure()
    fig, ax = plt.subplots(len(heterogeneity_list)+2,max(heterogeneity_list)+1, figsize=(16, 13))
    fig.subplots_adjust(hspace=0.8, wspace=0.5)
    n_pt_each_sp = num_rows//n_subpopulation
    sp_id_for_pt = np.zeros(num_rows)
    for k2 in range(max(heterogeneity_list)+1):
        # Column 0: all samples of gene pair k2, one scatter per class (rows cycle mod 3).
        ax[k2,0].scatter(generated_data[0::3,0+2*k2,k],generated_data[0::3,1+2*k2,k], s = 1.,alpha =0.5)
        ax[k2,0].scatter(generated_data[1::3,0+2*k2,k],generated_data[1::3,1+2*k2,k], s = 1.,alpha =0.5)
        ax[k2,0].scatter(generated_data[2::3,0+2*k2,k],generated_data[2::3,1+2*k2,k], s = 1.,alpha =0.5)
        ax[k2,0].set_xlabel(f'gene {1+2*k2}')
        ax[k2,0].set_ylabel(f'gene {2+2*k2}')
        if k2==0:
            ax[k2,0].set_title(f"No. of subpopulation: {heterogeneity_list[k]}",fontweight='bold')
        # Recompute the per-sample subpopulation assignment for this setting.
        for k3 in range(n_subpopulation):
            sp_id_for_pt[k3*(n_pt_each_sp):((k3+1)*(n_pt_each_sp))] = k3
        # Columns 1..n_subpopulation: the same gene pair restricted to one subpopulation,
        # still split into the three classes via the mod-3 row masks.
        for k3 in range(n_subpopulation):
            ax[k2,k3+1].scatter(generated_data[(sp_id_for_pt==k3)&(np.arange(0,num_rows)%3==0),0+2*k2,k],
                                generated_data[(sp_id_for_pt==k3)&(np.arange(0,num_rows)%3==0),1+2*k2,k], s = 1.,alpha =0.5)
            ax[k2,k3+1].scatter(generated_data[(sp_id_for_pt==k3)&(np.arange(0,num_rows)%3==1),0+2*k2,k],
                                generated_data[(sp_id_for_pt==k3)&(np.arange(0,num_rows)%3==1),1+2*k2,k], s = 1.,alpha =0.5)
            ax[k2,k3+1].scatter(generated_data[(sp_id_for_pt==k3)&(np.arange(0,num_rows)%3==2),0+2*k2,k],
                                generated_data[(sp_id_for_pt==k3)&(np.arange(0,num_rows)%3==2),1+2*k2,k], s = 1.,alpha =0.5)
            ax[k2,k3+1].set_xlabel(f'gene {1+2*k2}')
            ax[k2,k3+1].set_ylabel(f'gene {2+2*k2}')
            if k2==0:
                ax[k2,k3+1].set_title(f"Subpopulation:{k3+1}")
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
#importing library
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from typing import Optional, List, Tuple
# multiclass_log_loss for LGBM
class MultiLoglossForLGBM:
    """Multiclass log-loss with optional soft (overlapping) targets for LightGBM.

    Plug `return_grad_and_hess` in as the custom objective (`fobj`) and
    `return_loss` as the custom metric (`feval`) of `lgb.train`.  When
    `grand_truth` is supplied, integer label ``j`` is mapped to the soft target
    distribution ``grand_truth[j]`` instead of a hard one-hot vector.
    """

    def __init__(self, n_class: int=3, use_softmax: bool=True, epsilon: float=1e-32, grand_truth=np.empty(0)) -> None:
        """Initialize.

        Args:
            n_class: number of classes.
            use_softmax: if True, convert raw margins to probabilities via
                softmax; if False, predictions are used as probabilities as-is.
            epsilon: clipping bound keeping log() finite.
            grand_truth: optional (n_class, n_class) soft-label matrix; row j is
                the target distribution for label j.  Empty array = hard one-hot.
        """
        self.name = "SFC_loss"
        self.grand_truth = grand_truth
        self.n_class = n_class
        self.prob_func = self._get_prob_value if use_softmax else lambda x: x
        self.epsilon = epsilon

    def __call__(self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None) -> float:
        """Return the (weighted) mean cross-entropy of `preds` against `labels`.

        Args:
            preds: (n_example, n_class) logits (or probabilities if use_softmax=False).
            labels: (n_example,) integer class labels.
            weight: optional per-example weights.
        """
        prob = self.prob_func(preds)            # logits -> probabilities
        labels = self._get_1hot_label(labels)   # 1D labels -> (soft) one-hot matrix
        loss_by_sample = np.sum(- np.log(prob) * labels, axis=1)
        # BUG FIX: weights must be passed via the `weights=` keyword —
        # np.average's second positional argument is `axis`, so the original
        # `np.average(loss_by_sample, weight)` never applied the weights.
        loss = np.average(loss_by_sample, weights=weight)
        return loss

    def _calc_grad_and_hess(
        self, preds: np.ndarray, labels: np.ndarray, weight: Optional[np.ndarray]=None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return per-element gradient and diagonal Hessian of softmax cross-entropy."""
        prob = self.prob_func(preds)
        labels = self._get_1hot_label(labels)
        # Standard softmax cross-entropy derivatives (diagonal Hessian approximation).
        grad = prob - labels
        hess = prob * (1 - prob)
        if weight is not None:
            grad = grad * weight[:, None]
            hess = hess * weight[:, None]
        return grad, hess

    # NOTE: the lgb.Dataset annotations are quoted so the class can be defined
    # even when lightgbm is not importable (e.g. in isolated tests).
    def return_loss(self, preds: np.ndarray, data: "lgb.Dataset") -> Tuple[str, float, bool]:
        """Custom-metric hook for lightgbm: returns (name, loss, is_higher_better)."""
        labels = data.get_label()
        weight = data.get_weight()
        n_example = len(labels)
        # reshape preds: (n_class * n_example,) => (n_class, n_example) => (n_example, n_class)
        preds = preds.reshape(self.n_class, n_example).T
        loss = self(preds, labels, weight)
        return self.name, loss, False

    def return_grad_and_hess(self, preds: np.ndarray, data: "lgb.Dataset") -> Tuple[np.ndarray, np.ndarray]:
        """Custom-objective hook for lightgbm: returns flat (grad, hess) arrays."""
        labels = data.get_label()
        weight = data.get_weight()
        n_example = len(labels)
        # reshape preds: (n_class * n_example,) => (n_class, n_example) => (n_example, n_class)
        preds = preds.reshape(self.n_class, n_example).T
        grad, hess = self._calc_grad_and_hess(preds, labels, weight)
        # reshape back to the flat layout lightgbm expects: (n_class * n_example,)
        grad = grad.T.reshape(n_example * self.n_class)
        hess = hess.T.reshape(n_example * self.n_class)
        return grad, hess

    @staticmethod
    def softmax(x):
        """Numerically stable softmax over all elements of `x`.

        Utility helper, unused internally.  BUG FIX: the original lacked
        @staticmethod (so instance calls passed `self` as `x`) and could
        overflow for large inputs; subtracting the max leaves the result
        mathematically unchanged.
        """
        x = x - np.max(x)
        return np.exp(x)/np.sum(np.exp(x))

    def _get_prob_value(self, preds: np.ndarray) -> np.ndarray:
        """Convert margins (logits) to probabilities with a row-wise stable softmax."""
        # Subtracting each row's max is a no-op mathematically but avoids overflow.
        upper = np.exp(preds - np.max(preds, axis=1, keepdims=True))
        prob = upper / np.sum(upper, axis=1, keepdims=True)
        prob = np.clip(prob, self.epsilon, 1 - self.epsilon)
        return prob

    def _get_1hot_label(self, labels: np.ndarray) -> np.ndarray:
        """Convert integer labels to an (n_example, n_class) target matrix.

        Hard one-hot when `grand_truth` is empty; otherwise row j of
        `grand_truth` is used as the soft target for label j.
        """
        n_example = len(labels)
        onehot = np.zeros((n_example, self.n_class))
        original_array = self.grand_truth
        # The emptiness check does not depend on the row, so branch once.
        if original_array.shape[0] == 0:
            onehot[np.arange(n_example), np.asarray(labels, dtype=int)] = 1
        else:
            for index, j in enumerate(labels):
                onehot[index, :] = original_array[int(j)]
        return onehot
# Build the feature table for the last heterogeneity setting and attach the class label.
data_df_test = pd.DataFrame(generated_data[:,:,-1])
data_df_test['patient'] = pt_class
# data split and standardization
x_gbm = data_df_test.drop('patient', axis = 1).values
y_gbm = pt_class #data_df_test['patient'].values
sc = StandardScaler()
sc.fit(x_gbm)
x_gbm = sc.transform(x_gbm)
# Small-scale subsample (every 23rd row) for hyper-parameter tuning by optuna.
x_gbm_small = x_gbm[::23,:]
y_gbm_small = y_gbm[::23]
n_splits = 5
# parameters optimization using optuna
def objective(trial):
    """Optuna objective: mean stratified 5-fold CV accuracy of a LightGBM
    multiclass model on the subsampled data (x_gbm_small / y_gbm_small).

    Args:
        trial: optuna Trial used to sample the hyperparameters.

    Returns:
        Mean validation accuracy across folds (the study maximizes this).
    """
    # Set LightGBM hyperparameters using Optuna.
    # NOTE(review): suggest_loguniform/suggest_uniform are deprecated in newer
    # optuna in favor of trial.suggest_float(..., log=True); kept for the
    # optuna version this notebook was run with.
    param = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_class': 3,
        'num_iteration': 500,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
    }
    # Set 5-fold cross-validation
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state = 2025)
    # List to save cross-validation results
    cv_results = []
    # Train and evaluate the model for each fold
    for train_idx, valid_idx in cv.split(x_gbm_small, y_gbm_small):
        # BUG FIX: the fold indices come from x_gbm_small/y_gbm_small, so they
        # must index the *small* arrays; the original indexed the full
        # x_gbm/y_gbm, silently training on unrelated rows of the full dataset.
        X_train, X_valid = x_gbm_small[train_idx], x_gbm_small[valid_idx]
        y_train, y_valid = y_gbm_small[train_idx], y_gbm_small[valid_idx]
        lgb_train = lgb.Dataset(X_train, label = y_train)
        lgb_valid = lgb.Dataset(X_valid, label = y_valid, reference = lgb_train)
        gbm = lgb.train(param,
                        lgb_train,
                        valid_sets = [lgb_train, lgb_valid],
                        early_stopping_rounds = 70,
                        verbose_eval = False)
        y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
        y_pred_max = np.argmax(y_pred, axis=1)
        cv_results.append(accuracy_score(y_valid, y_pred_max))
    # Return the mean accuracy from cross-validation (original comment said
    # "minimum", which did not match the code).
    return np.mean(cv_results)
# Create an Optuna study and run the search.
# NOTE: this executes 30 trials x 5-fold LightGBM training — slow.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
# Output the best hyperparameters.
print('Best trial:')
# `trial` is a module-level binding reused later to build `best_params`.
trial = study.best_trial
print(' Value: {}'.format(trial.value))
print(' Params: ')
for key, value in trial.params.items():
    print(' {}: {}'.format(key, value))
[I 2025-06-12 09:05:16,631] A new study created in memory with name: no-name-3fac61bc-f6b9-4c3a-9777-45c23081c3d9
[I 2025-06-12 09:07:17,723] Trial 0 finished with value: 0.9266384778012684 and parameters: {'lambda_l1': 2.6102331334792888e-06, 'lambda_l2': 6.927944956223782e-08, 'num_leaves': 57, 'feature_fraction': 0.6360833207498244, 'bagging_fraction': 0.901332947499227, 'bagging_freq': 6, 'min_child_samples': 27, 'learning_rate': 0.001945632688687439}. Best is trial 0 with value: 0.9266384778012684.
[I 2025-06-12 09:09:09,805] Trial 1 finished with value: 0.931183932346723 and parameters: {'lambda_l1': 6.524874907751178e-06, 'lambda_l2': 0.04480821831058013, 'num_leaves': 34, 'feature_fraction': 0.6179404556780297, 'bagging_fraction': 0.9826203017591404, 'bagging_freq': 4, 'min_child_samples': 25, 'learning_rate': 0.0030535055305313755}. Best is trial 1 with value: 0.931183932346723.
[I 2025-06-12 09:09:17,916] Trial 2 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 1.2275308672294275e-06, 'lambda_l2': 1.113106036754439e-05, 'num_leaves': 104, 'feature_fraction': 0.6167721716057784, 'bagging_fraction': 0.5935063088393333, 'bagging_freq': 5, 'min_child_samples': 65, 'learning_rate': 0.00030421877745556925}. Best is trial 1 with value: 0.931183932346723.
[I 2025-06-12 09:11:29,092] Trial 3 finished with value: 0.9449260042283297 and parameters: {'lambda_l1': 6.508062206551498e-06, 'lambda_l2': 0.0006236522634875327, 'num_leaves': 13, 'feature_fraction': 0.9667312069218028, 'bagging_fraction': 0.5452066813787355, 'bagging_freq': 4, 'min_child_samples': 18, 'learning_rate': 0.0001924913572221853}. Best is trial 3 with value: 0.9449260042283297.
[I 2025-06-12 09:14:56,181] Trial 4 finished with value: 0.9127906976744186 and parameters: {'lambda_l1': 0.12286670402932784, 'lambda_l2': 8.302093507935926e-06, 'num_leaves': 77, 'feature_fraction': 0.8928216519370031, 'bagging_fraction': 0.9634325925390339, 'bagging_freq': 2, 'min_child_samples': 9, 'learning_rate': 6.431558779007063e-05}. Best is trial 3 with value: 0.9449260042283297.
[I 2025-06-12 09:14:57,408] Trial 5 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 8.165191165946453e-08, 'lambda_l2': 0.053355840392958195, 'num_leaves': 223, 'feature_fraction': 0.8441723056591293, 'bagging_fraction': 0.8284718962790149, 'bagging_freq': 3, 'min_child_samples': 93, 'learning_rate': 1.821031979414641e-05}. Best is trial 3 with value: 0.9449260042283297.
[I 2025-06-12 09:16:05,327] Trial 6 finished with value: 0.9541226215644821 and parameters: {'lambda_l1': 0.0010594837308910087, 'lambda_l2': 2.4842665723411326, 'num_leaves': 105, 'feature_fraction': 0.6915116579516776, 'bagging_fraction': 0.9391048277449666, 'bagging_freq': 7, 'min_child_samples': 49, 'learning_rate': 0.022919437340785314}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:18:08,229] Trial 7 finished with value: 0.9357293868921776 and parameters: {'lambda_l1': 3.8254972097593943e-07, 'lambda_l2': 4.544651235221341e-05, 'num_leaves': 225, 'feature_fraction': 0.5395931966156191, 'bagging_fraction': 0.9544797029865358, 'bagging_freq': 5, 'min_child_samples': 25, 'learning_rate': 0.0015220001668849275}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:18:09,520] Trial 8 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 8.424691965307376e-07, 'lambda_l2': 5.768027168244033, 'num_leaves': 110, 'feature_fraction': 0.8396039684572938, 'bagging_fraction': 0.825776366981839, 'bagging_freq': 5, 'min_child_samples': 89, 'learning_rate': 3.30817826949166e-05}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:18:17,833] Trial 9 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 2.8425708192121057e-07, 'lambda_l2': 0.07891699419964239, 'num_leaves': 106, 'feature_fraction': 0.5189780010342828, 'bagging_fraction': 0.7490452417705927, 'bagging_freq': 7, 'min_child_samples': 85, 'learning_rate': 2.1070793994181373e-05}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:18:26,282] Trial 10 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 0.009163824867307543, 'lambda_l2': 6.218064290289853, 'num_leaves': 164, 'feature_fraction': 0.7701283456767841, 'bagging_fraction': 0.43309250830072665, 'bagging_freq': 1, 'min_child_samples': 54, 'learning_rate': 0.06558759247444836}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:18:54,159] Trial 11 finished with value: 0.9402748414376321 and parameters: {'lambda_l1': 0.0002839248959508303, 'lambda_l2': 0.001312799983205981, 'num_leaves': 2, 'feature_fraction': 0.9864477340273684, 'bagging_fraction': 0.5819857181082826, 'bagging_freq': 7, 'min_child_samples': 38, 'learning_rate': 0.043529058158602314}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:19:02,895] Trial 12 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 7.441440640078801, 'lambda_l2': 7.203791337650922e-08, 'num_leaves': 160, 'feature_fraction': 0.7281796197366558, 'bagging_fraction': 0.603546241553344, 'bagging_freq': 3, 'min_child_samples': 70, 'learning_rate': 0.01285778709308108}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:20:43,896] Trial 13 finished with value: 0.9310782241014799 and parameters: {'lambda_l1': 0.00011269335787126489, 'lambda_l2': 0.0008054537072297728, 'num_leaves': 5, 'feature_fraction': 0.41196928822782114, 'bagging_fraction': 0.4205273926654217, 'bagging_freq': 4, 'min_child_samples': 9, 'learning_rate': 0.00025374018565539055}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:21:25,704] Trial 14 finished with value: 0.9221987315010571 and parameters: {'lambda_l1': 0.007464517364976608, 'lambda_l2': 0.6858475857334031, 'num_leaves': 167, 'feature_fraction': 0.9920090220867162, 'bagging_fraction': 0.5101800263729338, 'bagging_freq': 6, 'min_child_samples': 44, 'learning_rate': 0.009545648741291752}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:22:02,558] Trial 15 finished with value: 0.8394291754756871 and parameters: {'lambda_l1': 2.15612636641526e-05, 'lambda_l2': 0.0054980511177335056, 'num_leaves': 55, 'feature_fraction': 0.766693639087492, 'bagging_fraction': 0.6808241405077038, 'bagging_freq': 3, 'min_child_samples': 58, 'learning_rate': 0.00032992573417682395}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:22:54,346] Trial 16 finished with value: 0.9036997885835095 and parameters: {'lambda_l1': 0.002700119348834772, 'lambda_l2': 1.0360451581664096e-06, 'num_leaves': 141, 'feature_fraction': 0.6807090765477425, 'bagging_fraction': 0.7097785939616645, 'bagging_freq': 6, 'min_child_samples': 41, 'learning_rate': 9.898865373032181e-05}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:23:03,077] Trial 17 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 3.125512910320147e-08, 'lambda_l2': 0.6054042534154945, 'num_leaves': 195, 'feature_fraction': 0.9128637703511097, 'bagging_fraction': 0.5184435769220397, 'bagging_freq': 1, 'min_child_samples': 77, 'learning_rate': 0.01004421189547668}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:24:08,093] Trial 18 finished with value: 0.945031712473573 and parameters: {'lambda_l1': 0.23317453248776052, 'lambda_l2': 0.0001851669180821967, 'num_leaves': 253, 'feature_fraction': 0.535773521551884, 'bagging_fraction': 0.8066577973544067, 'bagging_freq': 4, 'min_child_samples': 18, 'learning_rate': 0.02867353093236123}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:24:54,607] Trial 19 finished with value: 0.9405919661733615 and parameters: {'lambda_l1': 0.3229334373668929, 'lambda_l2': 4.9892934953135836e-05, 'num_leaves': 245, 'feature_fraction': 0.43037862758866735, 'bagging_fraction': 0.843749232478472, 'bagging_freq': 2, 'min_child_samples': 32, 'learning_rate': 0.03363812687833878}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:24:55,767] Trial 20 finished with value: 0.3256871035940804 and parameters: {'lambda_l1': 0.2673577587746034, 'lambda_l2': 0.005525983592135059, 'num_leaves': 256, 'feature_fraction': 0.5250067788050102, 'bagging_fraction': 0.9031354227600373, 'bagging_freq': 7, 'min_child_samples': 100, 'learning_rate': 0.0213716885167339}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:25:39,239] Trial 21 finished with value: 0.9449260042283297 and parameters: {'lambda_l1': 7.510719663909653, 'lambda_l2': 0.00019537738888203987, 'num_leaves': 191, 'feature_fraction': 0.5602722621866028, 'bagging_fraction': 0.762591101991334, 'bagging_freq': 4, 'min_child_samples': 17, 'learning_rate': 0.09847960096293353}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:27:22,718] Trial 22 finished with value: 0.9266384778012684 and parameters: {'lambda_l1': 0.0012651780671589486, 'lambda_l2': 8.958479982350027e-07, 'num_leaves': 31, 'feature_fraction': 0.476402534693228, 'bagging_fraction': 0.659716688569129, 'bagging_freq': 5, 'min_child_samples': 17, 'learning_rate': 0.003567669909461695}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:32:31,250] Trial 23 finished with value: 0.9312896405919661 and parameters: {'lambda_l1': 5.454450154895173e-05, 'lambda_l2': 1.0296077488360555e-08, 'num_leaves': 84, 'feature_fraction': 0.6989275431226816, 'bagging_fraction': 0.8773987938142698, 'bagging_freq': 4, 'min_child_samples': 5, 'learning_rate': 0.0004684263035408594}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:33:31,123] Trial 24 finished with value: 0.9495771670190274 and parameters: {'lambda_l1': 0.02812186173785912, 'lambda_l2': 0.010182651712090198, 'num_leaves': 133, 'feature_fraction': 0.7874882072939549, 'bagging_fraction': 0.7955681430303243, 'bagging_freq': 3, 'min_child_samples': 46, 'learning_rate': 0.004674245733045683}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:34:21,570] Trial 25 finished with value: 0.945031712473573 and parameters: {'lambda_l1': 0.04581307100286669, 'lambda_l2': 0.009978790197137773, 'num_leaves': 129, 'feature_fraction': 0.7689168158310726, 'bagging_fraction': 0.7774371064362486, 'bagging_freq': 2, 'min_child_samples': 48, 'learning_rate': 0.0058519708039009525}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:35:03,938] Trial 26 finished with value: 0.908245243128964 and parameters: {'lambda_l1': 0.035294447674446346, 'lambda_l2': 0.5807931318948669, 'num_leaves': 201, 'feature_fraction': 0.8355338993620934, 'bagging_fraction': 0.7913091898549621, 'bagging_freq': 3, 'min_child_samples': 61, 'learning_rate': 0.01599251494130772}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:35:52,437] Trial 27 finished with value: 0.9495771670190274 and parameters: {'lambda_l1': 0.6112861836124636, 'lambda_l2': 0.15384281599684868, 'num_leaves': 136, 'feature_fraction': 0.5872531609173969, 'bagging_fraction': 0.9410793941160366, 'bagging_freq': 3, 'min_child_samples': 50, 'learning_rate': 0.027208890400951664}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:36:56,130] Trial 28 finished with value: 0.9127906976744186 and parameters: {'lambda_l1': 1.3618107243565343, 'lambda_l2': 0.15974902159922666, 'num_leaves': 131, 'feature_fraction': 0.6453841188326073, 'bagging_fraction': 0.935356299937701, 'bagging_freq': 2, 'min_child_samples': 50, 'learning_rate': 0.0008223744380805984}. Best is trial 6 with value: 0.9541226215644821.
[I 2025-06-12 09:38:16,372] Trial 29 finished with value: 0.9266384778012684 and parameters: {'lambda_l1': 0.001313160491163887, 'lambda_l2': 2.121396694012517, 'num_leaves': 84, 'feature_fraction': 0.5795769040307805, 'bagging_fraction': 0.9104381089850295, 'bagging_freq': 3, 'min_child_samples': 33, 'learning_rate': 0.003891653698541711}. Best is trial 6 with value: 0.9541226215644821.
Best trial:
Value: 0.9541226215644821
Params:
lambda_l1: 0.0010594837308910087
lambda_l2: 2.4842665723411326
num_leaves: 105
feature_fraction: 0.6915116579516776
bagging_fraction: 0.9391048277449666
bagging_freq: 7
min_child_samples: 49
learning_rate: 0.022919437340785314
# Set up the cross-validation bookkeeping and final model hyperparameters.
n_splits = 5
feature_names = [f"gene {i+1}" for i in range(num_genes)]
CV_names = [f"CV{i}" for i in range(n_splits)]
heteroN = [f"heteroN {i}" for i in heterogeneity_list]
# Per-gene importance summaries (mean/std across CV folds), one column per heterogeneity setting.
df_importance_GC_mean = pd.DataFrame(columns = heteroN, index = feature_names)
df_importance_GC_std = pd.DataFrame(columns = heteroN, index = feature_names)
df_importance_SFC_mean = pd.DataFrame(columns = heteroN, index = feature_names)
df_importance_SFC_std = pd.DataFrame(columns = heteroN, index = feature_names)
df_importance_Diff_mean = pd.DataFrame(columns = heteroN, index = feature_names)
df_importance_Diff_std = pd.DataFrame(columns = heteroN, index = feature_names)
# LightGBM hyperparameters: start from the Optuna best trial, pin the fixed settings.
best_params = trial.params.copy()
best_params.update({
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_class': 3,
    'num_iteration': 500,
    'verbosity': -1,
    'early_stopping_rounds': 100,
    # NOTE(review): 'importance_type' is not a LightGBM *training* parameter —
    # the training log shows "Unknown parameter: importance_type" warnings.
    # Importance is requested explicitly via feature_importance(importance_type='gain') later.
    'importance_type': 'gain',
    'seed': 42
})
# Soft-label ("overlap") assumption for the SFC model: each class leaks
# `overlap_w` probability mass into its adjacent class(es).
overlap_w = 0.05
grand_truth_SFC = np.array([[1.-overlap_w,overlap_w,0],[overlap_w,1.-2*overlap_w,overlap_w],[0,overlap_w,1.-overlap_w]])
# Placeholders for the results: (model x fold x setting) and (gene x fold x setting).
array_acc = np.zeros((2,n_splits,len(heterogeneity_list)))
array_importance_GC= np.zeros((num_genes,n_splits,len(heterogeneity_list)))
array_importance_SFC = np.zeros((num_genes,n_splits,len(heterogeneity_list)))
array_importance_Diff = np.zeros((num_genes,n_splits,len(heterogeneity_list)))
# Main experiment: for each heterogeneity setting, run stratified 5-fold CV and
# train two LightGBM models per fold — GC (standard multiclass log-loss) and
# SFC (custom soft-label loss) — recording accuracy and gain importances.
for spN_id, n_subpopulation in tqdm(enumerate(heterogeneity_list), total=len(heterogeneity_list)):
    df_acc = pd.DataFrame(columns=CV_names, index=['GC','SFC'])
    df_importance_GC = pd.DataFrame(columns=CV_names, index=feature_names)
    df_importance_SFC = pd.DataFrame(columns=CV_names, index=feature_names)
    df_importance_Diff = pd.DataFrame(columns=CV_names, index=feature_names)
    # annotating: features for this heterogeneity setting plus the class label
    data_df_test = pd.DataFrame(generated_data[:,:,spN_id])
    data_df_test['patient'] = pt_class
    # data split and standardization
    x_gbm = data_df_test.drop('patient', axis = 1).values
    y_gbm = pt_class #data_df_test['patient'].values
    sc = StandardScaler()
    sc.fit(x_gbm)
    x_gbm = sc.transform(x_gbm)
    # NOTE(review): the two train_test_split results below are unused — x_tr/x_va
    # etc. are overwritten by the CV fold indexing inside the loop.
    x_train_gbm, x_test_gbm, y_train_gbm, y_test_gbm = train_test_split(x_gbm, y_gbm, test_size=0.2, shuffle = True, random_state=2022, stratify=y_gbm)
    # val data for classification
    x_tr, x_va, y_tr, y_va = train_test_split(x_train_gbm, y_train_gbm, test_size=0.2, shuffle = True, random_state=2022, stratify=y_train_gbm)
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2025).split(x_gbm, y_gbm))
    # CV loop
    for nfold, (train_index, valid_index) in enumerate(cv):
        print("-"*20, f"subpopID: {spN_id}, cv: {nfold}", "-"*20)
        x_tr, y_tr = x_gbm[train_index], y_gbm[train_index]
        x_va, y_va = x_gbm[valid_index], y_gbm[valid_index]
        # define lgb dataset
        lgb_train = lgb.Dataset(x_tr, label = y_tr)
        lgb_eval = lgb.Dataset(x_va, label = y_va, reference = lgb_train)
        print("-"*20, "GC model learning", "-"*20)
        # GC model: standard multiclass objective from best_params.
        # NOTE(review): early_stopping_rounds=70 here overrides the
        # 'early_stopping_rounds': 100 already present in best_params.
        evaluation_results={}
        model_GC = lgb.train(best_params,
                             train_set=lgb_train,
                             valid_names=['train', 'valid'],
                             valid_sets=[lgb_train, lgb_eval],
                             evals_result=evaluation_results,
                             early_stopping_rounds = 70,
                             verbose_eval=False,
                             )
        # Accuracy on the held-out fold (labelled "test data" in the printout).
        y_pred_prob = model_GC.predict(x_va, num_iteration = model_GC.best_iteration)
        y_pred_GC = np.argmax(y_pred_prob,axis=1)
        acc_GC = accuracy_score(y_va, y_pred_GC)
        print(f"Test data accuracy: {acc_GC}")
        # SFC model: same data, but custom soft-label objective/metric.
        print("-"*20, "SFC model learning", "-"*20)
        evaluation_results2={}
        SFC_loss = MultiLoglossForLGBM(n_class = 3, grand_truth=grand_truth_SFC, use_softmax = True)
        model_SFC = lgb.train(best_params,
                              train_set=lgb_train,
                              valid_names=['train', 'valid'],
                              valid_sets=[lgb_train, lgb_eval],
                              evals_result=evaluation_results2,
                              fobj=SFC_loss.return_grad_and_hess,
                              feval=lambda preds, data: SFC_loss.return_loss(preds, data),
                              early_stopping_rounds = 70,
                              verbose_eval=False,
                              )
        # Accuracy on the held-out fold for the SFC model.
        y_pred_prob_SFC = model_SFC.predict(x_va, num_iteration = model_SFC.best_iteration)
        y_pred_SFC = np.argmax(y_pred_prob_SFC,axis=1)
        acc_SFC = accuracy_score(y_va, y_pred_SFC)
        print(f"Test data accuracy: {acc_SFC}")
        # Gain importances, normalized to sum to 1 per model.
        f_importance_GC = np.array(model_GC.feature_importance(importance_type='gain'))
        f_importance_GC = f_importance_GC / np.sum(f_importance_GC)
        f_importance_SFC = np.array(model_SFC.feature_importance(importance_type='gain'))
        f_importance_SFC = f_importance_SFC / np.sum(f_importance_SFC)
        df_acc.at['GC',CV_names[nfold]] = acc_GC
        df_acc.at['SFC',CV_names[nfold]] = acc_SFC
        df_importance_GC[CV_names[nfold]] = f_importance_GC
        df_importance_SFC[CV_names[nfold]] = f_importance_SFC
        df_importance_Diff[CV_names[nfold]] = f_importance_SFC - f_importance_GC
    # Collect this setting's per-fold results into the 3D arrays.
    array_acc[:,:,spN_id] = df_acc.to_numpy()
    array_importance_GC[:,:,spN_id] = df_importance_GC.to_numpy()
    array_importance_SFC[:,:,spN_id]= df_importance_SFC.to_numpy()
    array_importance_Diff[:,:,spN_id]= df_importance_Diff.to_numpy()
    # Summarize importances over folds, but only folds where BOTH models beat
    # the accuracy threshold (2/3) are included in the mean/std.
    acc_th = 2./3.
    df_importance_GC_mean[heteroN[spN_id]] = df_importance_GC.loc[:,(df_acc>=acc_th).sum(axis=0)==2].mean(axis=1)
    df_importance_GC_std[heteroN[spN_id]] = df_importance_GC.loc[:,(df_acc>=acc_th).sum(axis=0)==2].std(axis=1)
    df_importance_SFC_mean[heteroN[spN_id]]= df_importance_SFC.loc[:,(df_acc>=acc_th).sum(axis=0)==2].mean(axis=1)
    df_importance_SFC_std[heteroN[spN_id]] = df_importance_SFC.loc[:,(df_acc>=acc_th).sum(axis=0)==2].std(axis=1)
    df_importance_Diff_mean[heteroN[spN_id]]= df_importance_Diff.loc[:,(df_acc>=acc_th).sum(axis=0)==2].mean(axis=1)
    df_importance_Diff_std[heteroN[spN_id]] = df_importance_Diff.loc[:,(df_acc>=acc_th).sum(axis=0)==2].std(axis=1)
    # Free per-setting frames before the next (large) iteration.
    del df_acc,df_importance_GC,df_importance_SFC,data_df_test
0%| | 0/4 [00:00<?, ?it/s]
-------------------- subpopID: 0, cv: 0 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.789 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.782 -------------------- subpopID: 0, cv: 1 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.769 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.771 -------------------- subpopID: 0, cv: 2 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.773 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.78 -------------------- subpopID: 0, cv: 3 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.784 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.782 -------------------- subpopID: 0, cv: 4 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.785 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type
25%|█████████▊ | 1/4 [1:03:56<3:11:48, 3836.21s/it]
Test data accuracy: 0.782 -------------------- subpopID: 1, cv: 0 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.677 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.668 -------------------- subpopID: 1, cv: 1 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.69 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.679 -------------------- subpopID: 1, cv: 2 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.681 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.681 -------------------- subpopID: 1, cv: 3 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.665 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.653 -------------------- subpopID: 1, cv: 4 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.666 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type
50%|███████████████████▌ | 2/4 [2:11:36<2:12:15, 3967.79s/it]
Test data accuracy: 0.667 -------------------- subpopID: 2, cv: 0 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.589 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.585 -------------------- subpopID: 2, cv: 1 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.622 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.604 -------------------- subpopID: 2, cv: 2 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.603 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.601 -------------------- subpopID: 2, cv: 3 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.628 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.609 -------------------- subpopID: 2, cv: 4 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.601 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type
75%|█████████████████████████████▎ | 3/4 [3:22:36<1:08:21, 4101.48s/it]
Test data accuracy: 0.596 -------------------- subpopID: 3, cv: 0 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.582 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.557 -------------------- subpopID: 3, cv: 1 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.567 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.55 -------------------- subpopID: 3, cv: 2 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.588 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.561 -------------------- subpopID: 3, cv: 3 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.563 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.551 -------------------- subpopID: 3, cv: 4 -------------------- -------------------- GC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type Test data accuracy: 0.581 -------------------- SFC model learning -------------------- [LightGBM] [Warning] Unknown parameter: importance_type
100%|█████████████████████████████████████████| 4/4 [4:33:36<00:00, 4104.02s/it]
Test data accuracy: 0.539
# Display the mean GC-SFC feature-importance difference per heterogeneity level
# (rows: genes, columns: "heteroN k"). NaN columns here indicate levels whose
# CV folds were filtered out upstream.
df_importance_Diff_mean
| heteroN 2 | heteroN 3 | heteroN 4 | heteroN 5 | |
|---|---|---|---|---|
| gene 1 | 0.011415 | 0.024208 | NaN | NaN |
| gene 2 | -0.003925 | 0.007570 | NaN | NaN |
| gene 3 | 0.020932 | 0.026486 | NaN | NaN |
| gene 4 | -0.000902 | 0.010305 | NaN | NaN |
| gene 5 | 0.000012 | 0.019161 | NaN | NaN |
| ... | ... | ... | ... | ... |
| gene 19994 | -0.000014 | -0.000008 | NaN | NaN |
| gene 19995 | -0.000013 | 0.000011 | NaN | NaN |
| gene 19996 | 0.000004 | -0.000005 | NaN | NaN |
| gene 19997 | 0.000002 | -0.000026 | NaN | NaN |
| gene 19998 | 0.000004 | -0.000013 | NaN | NaN |
19998 rows × 4 columns
# Build per-gene summary frames for genes 1-10: each frame has one row per
# heterogeneity level (columns of df_importance_*_mean) and two columns,
# 'GC' and 'SFC', holding the mean/std feature importance of that gene.
# Replaces 62 copy-pasted lines with a loop; final state is identical.
_gene_mean = {}
_gene_std = {}
for _g in range(1, 11):
    _row = f'gene {_g}'
    _mean = pd.DataFrame(index=df_importance_SFC_mean.columns, columns=['GC', 'SFC'])
    _std = pd.DataFrame(index=df_importance_SFC_mean.columns, columns=['GC', 'SFC'])
    _mean['GC'] = df_importance_GC_mean.loc[_row, :]
    _std['GC'] = df_importance_GC_std.loc[_row, :]
    _mean['SFC'] = df_importance_SFC_mean.loc[_row, :]
    _std['SFC'] = df_importance_SFC_std.loc[_row, :]
    _gene_mean[_g] = _mean
    _gene_std[_g] = _std
# Keep the original per-gene variable names so the plotting cells below
# continue to work unchanged.
(df_gene1_mean, df_gene2_mean, df_gene3_mean, df_gene4_mean, df_gene5_mean,
 df_gene6_mean, df_gene7_mean, df_gene8_mean, df_gene9_mean,
 df_gene10_mean) = (_gene_mean[g] for g in range(1, 11))
(df_gene1_std, df_gene2_std, df_gene3_std, df_gene4_std, df_gene5_std,
 df_gene6_std, df_gene7_std, df_gene8_std, df_gene9_std,
 df_gene10_std) = (_gene_std[g] for g in range(1, 11))
# 5x2 grid of bar plots: mean feature importance (GC vs SFC) per gene,
# std as error bars; y-label only on the left column.
fig, ax = plt.subplots(5, 2, figsize=(10, 15))
fig.subplots_adjust(hspace=0.8, wspace=0.5)
_gene_frames = [
    (df_gene1_mean, df_gene1_std), (df_gene2_mean, df_gene2_std),
    (df_gene3_mean, df_gene3_std), (df_gene4_mean, df_gene4_std),
    (df_gene5_mean, df_gene5_std), (df_gene6_mean, df_gene6_std),
    (df_gene7_mean, df_gene7_std), (df_gene8_mean, df_gene8_std),
    (df_gene9_mean, df_gene9_std), (df_gene10_mean, df_gene10_std),
]
for _i, (_m, _s) in enumerate(_gene_frames):
    _m.plot.bar(
        ax=ax[_i // 2, _i % 2], yerr=_s, capsize=2.,
        title=f'gene {_i + 1}',
        # typo "inportance" fixed in the displayed label
        ylabel='feature importance ' if _i % 2 == 0 else None,
        ylim=[0, 0.3])
plt.show()
# Repeat the 5x2 per-gene grid in a black/gray palette (print-friendly).
sns.set_palette(['#000000', '#ABABAB'])
fig, ax = plt.subplots(5, 2, figsize=(10, 15))
fig.subplots_adjust(hspace=0.8, wspace=0.5)
_gene_frames_gray = [
    (df_gene1_mean, df_gene1_std), (df_gene2_mean, df_gene2_std),
    (df_gene3_mean, df_gene3_std), (df_gene4_mean, df_gene4_std),
    (df_gene5_mean, df_gene5_std), (df_gene6_mean, df_gene6_std),
    (df_gene7_mean, df_gene7_std), (df_gene8_mean, df_gene8_std),
    (df_gene9_mean, df_gene9_std), (df_gene10_mean, df_gene10_std),
]
for _i, (_m, _s) in enumerate(_gene_frames_gray):
    _m.plot.bar(
        ax=ax[_i // 2, _i % 2], yerr=_s, capsize=2.,
        title=f'gene {_i + 1}',
        # typo "inportance" fixed in the displayed label
        ylabel='feature importance ' if _i % 2 == 0 else None,
        ylim=[0, 0.3])
plt.show()
sns.set_palette('bright')
heterogeneity_list
# One panel per heterogeneity level: mean importance difference for genes
# 1-10, std as error bars.
fig, ax = plt.subplots(1, len(heterogeneity_list), figsize=(14, 3))
fig.subplots_adjust(hspace=1.0, wspace=0.5)
for k, hetN in enumerate(heterogeneity_list):
    col = f"heteroN {hetN}"
    # .loc with a single column label yields a Series, so the original .T
    # was a no-op and is dropped.
    df_importance_Diff_mean.loc['gene 1':'gene 10', col].plot.bar(
        ax=ax[k],
        yerr=df_importance_Diff_std.loc['gene 1':'gene 10', col],
        capsize=2.,
        title=f"No. of subpopulation: {hetN}",  # typo "subpoulation" fixed
        ylabel='feature importance' if k == 0 else None)
# Accuracy threshold: keep a CV fold only if BOTH models are 1.5x better
# than 3-class chance (1/3).
acc_th = (1. / 3.) * 1.5
# Per-heterogeneity summaries restricted to folds passing the threshold.
df_importance_GC_mean2 = pd.DataFrame(columns=heteroN, index=feature_names)
df_importance_GC_std2 = pd.DataFrame(columns=heteroN, index=feature_names)
df_importance_SFC_mean2 = pd.DataFrame(columns=heteroN, index=feature_names)
df_importance_SFC_std2 = pd.DataFrame(columns=heteroN, index=feature_names)
df_importance_Diff_mean2 = pd.DataFrame(columns=heteroN, index=feature_names)
df_importance_Diff_std2 = pd.DataFrame(columns=heteroN, index=feature_names)
for spN_id, n_subpopulation in tqdm(enumerate(heterogeneity_list), total=len(heterogeneity_list)):
    print(spN_id)
    df_acc2 = pd.DataFrame(array_acc[:, :, spN_id], columns=CV_names, index=['GC', 'SFC'])
    df_importance_GC2 = pd.DataFrame(array_importance_GC[:, :, spN_id], columns=CV_names, index=feature_names)
    df_importance_SFC2 = pd.DataFrame(array_importance_SFC[:, :, spN_id], columns=CV_names, index=feature_names)
    df_importance_Diff2 = pd.DataFrame(array_importance_Diff[:, :, spN_id], columns=CV_names, index=feature_names)
    # Boolean mask of CV folds where both GC and SFC pass the threshold,
    # computed once instead of six times.
    good_cv = (df_acc2 >= acc_th).sum(axis=0) == 2
    col = heteroN[spN_id]
    df_importance_GC_mean2[col] = df_importance_GC2.loc[:, good_cv].mean(axis=1)
    df_importance_GC_std2[col] = df_importance_GC2.loc[:, good_cv].std(axis=1)
    df_importance_SFC_mean2[col] = df_importance_SFC2.loc[:, good_cv].mean(axis=1)
    df_importance_SFC_std2[col] = df_importance_SFC2.loc[:, good_cv].std(axis=1)
    df_importance_Diff_mean2[col] = df_importance_Diff2.loc[:, good_cv].mean(axis=1)
    df_importance_Diff_std2[col] = df_importance_Diff2.loc[:, good_cv].std(axis=1)
    # Release the large per-iteration frames; the original omitted
    # df_importance_Diff2 here and leaked it.
    del df_acc2, df_importance_GC2, df_importance_SFC2, df_importance_Diff2
100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 38.12it/s]
0 1 2 3
# Display the column label for the last heterogeneity level processed by the
# loop above (spN_id retains its final loop value).
heteroN[spN_id]
'heteroN 5'
# Display the raw GC importances (genes x CV folds) for the last
# heterogeneity level (index 3 -> "heteroN 5" per the previous output).
array_importance_GC[:,:,3]
array([[2.56766384e-02, 2.10548628e-02, 2.28991792e-02, 2.42945145e-02,
1.97377566e-02],
[1.52861471e-02, 1.35842414e-02, 1.26559236e-02, 1.34901082e-02,
1.36762746e-02],
[2.51851167e-02, 2.30716284e-02, 2.62886434e-02, 2.84761256e-02,
2.45034440e-02],
...,
[8.66751279e-06, 1.34037247e-05, 1.07056636e-06, 2.93074529e-05,
3.29199604e-05],
[9.63683761e-06, 1.41965850e-05, 2.52578924e-05, 3.02971152e-05,
8.48717338e-06],
[5.73258189e-05, 1.81789000e-05, 9.92141912e-06, 0.00000000e+00,
2.57747340e-05]])
# Display the SFC importance means after restricting to CV folds that passed
# the accuracy threshold (note: no NaN columns remain, unlike the unfiltered
# version shown earlier).
df_importance_SFC_mean2
| heteroN 2 | heteroN 3 | heteroN 4 | heteroN 5 | |
|---|---|---|---|---|
| gene 1 | 0.195293 | 0.117458 | 0.060206 | 0.035730 |
| gene 2 | 0.109781 | 0.061713 | 0.036320 | 0.017077 |
| gene 3 | 0.205409 | 0.110640 | 0.062788 | 0.041490 |
| gene 4 | 0.118651 | 0.065660 | 0.039600 | 0.036984 |
| gene 5 | 0.000018 | 0.104591 | 0.074401 | 0.051537 |
| ... | ... | ... | ... | ... |
| gene 19994 | 0.000023 | 0.000037 | 0.000103 | 0.000031 |
| gene 19995 | 0.000005 | 0.000015 | 0.000020 | 0.000000 |
| gene 19996 | 0.000019 | 0.000005 | 0.000106 | 0.000000 |
| gene 19997 | 0.000003 | 0.000000 | 0.000032 | 0.000000 |
| gene 19998 | 0.000167 | 0.000000 | 0.000000 | 0.000000 |
19998 rows × 4 columns
# Build per-gene summary frames for genes 1-10 from the fold-filtered
# ("2"-suffixed) GC/SFC summaries: one row per heterogeneity level, columns
# 'GC' and 'SFC'. Replaces 62 copy-pasted lines with a loop; final state
# is identical.
_gene_mean2 = {}
_gene_std2 = {}
for _g in range(1, 11):
    _row = f'gene {_g}'
    _mean = pd.DataFrame(index=df_importance_SFC_mean2.columns, columns=['GC', 'SFC'])
    _std = pd.DataFrame(index=df_importance_SFC_mean2.columns, columns=['GC', 'SFC'])
    _mean['GC'] = df_importance_GC_mean2.loc[_row, :]
    _std['GC'] = df_importance_GC_std2.loc[_row, :]
    _mean['SFC'] = df_importance_SFC_mean2.loc[_row, :]
    _std['SFC'] = df_importance_SFC_std2.loc[_row, :]
    _gene_mean2[_g] = _mean
    _gene_std2[_g] = _std
# Keep the original per-gene variable names (these rebind the names used by
# the earlier, unfiltered plots, exactly as the original code did).
(df_gene1_mean, df_gene2_mean, df_gene3_mean, df_gene4_mean, df_gene5_mean,
 df_gene6_mean, df_gene7_mean, df_gene8_mean, df_gene9_mean,
 df_gene10_mean) = (_gene_mean2[g] for g in range(1, 11))
(df_gene1_std, df_gene2_std, df_gene3_std, df_gene4_std, df_gene5_std,
 df_gene6_std, df_gene7_std, df_gene8_std, df_gene9_std,
 df_gene10_std) = (_gene_std2[g] for g in range(1, 11))
# 5x2 grid of bar plots for the fold-filtered per-gene frames:
# mean feature importance (GC vs SFC), std as error bars.
fig, ax = plt.subplots(5, 2, figsize=(10, 15))
fig.subplots_adjust(hspace=0.8, wspace=0.5)
_frames = [
    (df_gene1_mean, df_gene1_std), (df_gene2_mean, df_gene2_std),
    (df_gene3_mean, df_gene3_std), (df_gene4_mean, df_gene4_std),
    (df_gene5_mean, df_gene5_std), (df_gene6_mean, df_gene6_std),
    (df_gene7_mean, df_gene7_std), (df_gene8_mean, df_gene8_std),
    (df_gene9_mean, df_gene9_std), (df_gene10_mean, df_gene10_std),
]
for _i, (_m, _s) in enumerate(_frames):
    _m.plot.bar(
        ax=ax[_i // 2, _i % 2], yerr=_s, capsize=2.,
        title=f'gene {_i + 1}',
        # typo "inportance" fixed in the displayed label
        ylabel='feature importance ' if _i % 2 == 0 else None,
        ylim=[0, 0.3])
plt.show()
# Repeat the fold-filtered per-gene grid in a black/gray palette.
sns.set_palette(['#000000', '#ABABAB'])
fig, ax = plt.subplots(5, 2, figsize=(10, 15))
fig.subplots_adjust(hspace=0.8, wspace=0.5)
_frames_gray = [
    (df_gene1_mean, df_gene1_std), (df_gene2_mean, df_gene2_std),
    (df_gene3_mean, df_gene3_std), (df_gene4_mean, df_gene4_std),
    (df_gene5_mean, df_gene5_std), (df_gene6_mean, df_gene6_std),
    (df_gene7_mean, df_gene7_std), (df_gene8_mean, df_gene8_std),
    (df_gene9_mean, df_gene9_std), (df_gene10_mean, df_gene10_std),
]
for _i, (_m, _s) in enumerate(_frames_gray):
    _m.plot.bar(
        ax=ax[_i // 2, _i % 2], yerr=_s, capsize=2.,
        title=f'gene {_i + 1}',
        # typo "inportance" fixed in the displayed label
        ylabel='feature importance ' if _i % 2 == 0 else None,
        ylim=[0, 0.3])
plt.show()
sns.set_palette('bright')
heterogeneity_list
# One panel per heterogeneity level: fold-filtered mean importance
# difference for genes 1-10, std as error bars.
fig, ax = plt.subplots(1, len(heterogeneity_list), figsize=(14, 3))
fig.subplots_adjust(hspace=1.0, wspace=0.5)
for k, hetN in enumerate(heterogeneity_list):
    col = f"heteroN {hetN}"
    # .loc with a single column label yields a Series; the original .T was
    # a no-op and is dropped.
    df_importance_Diff_mean2.loc['gene 1':'gene 10', col].plot.bar(
        ax=ax[k],
        yerr=df_importance_Diff_std2.loc['gene 1':'gene 10', col],
        capsize=2.,
        title=f"No. of subpopulation: {hetN}",  # typo "subpoulation" fixed
        # raw string avoids the invalid "\D" escape-sequence warning
        ylabel=r'$\Delta$feature importance' if k == 0 else None)